# Data
import numpy as np
import pandas as pd
# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt
# Data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Clustering Models
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Load the customer records from a CSV located relative to the working directory.
d = pd.read_csv('data.csv')
# Bare expression: renders the frame when it is the last statement of a notebook cell.
d
| | CustomerID | Gender | Age | Annual Income ($) | Spending Score (1-100) | Profession | Work Experience | Family Size |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15000 | 39 | Healthcare | 1 | 4 |
| 1 | 2 | Male | 21 | 35000 | 81 | Engineer | 3 | 3 |
| 2 | 3 | Female | 20 | 86000 | 6 | Engineer | 1 | 1 |
| 3 | 4 | Female | 23 | 59000 | 77 | Lawyer | 0 | 2 |
| 4 | 5 | Female | 31 | 38000 | 40 | Entertainment | 2 | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1995 | 1996 | Female | 71 | 184387 | 40 | Artist | 8 | 7 |
| 1996 | 1997 | Female | 91 | 73158 | 32 | Doctor | 7 | 7 |
| 1997 | 1998 | Male | 87 | 90961 | 14 | Healthcare | 9 | 2 |
| 1998 | 1999 | Male | 77 | 182109 | 4 | Executive | 7 | 2 |
| 1999 | 2000 | Male | 90 | 110610 | 52 | Entertainment | 5 | 2 |
2000 rows × 8 columns
# Give the two verbose columns short names, then narrow the frame to the
# columns used by the rest of the analysis.
d = d.rename(
    columns={
        'Annual Income ($)': 'Income',
        'Spending Score (1-100)': 'Score',
    }
).loc[:, ['Gender', 'Age', 'Income', 'Score']]
# Summary statistics of the numeric columns (notebook cell output).
d.describe()
| | Age | Income | Score |
|---|---|---|---|
| count | 2000.000000 | 2000.000000 | 2000.000000 |
| mean | 48.960000 | 110731.821500 | 50.962500 |
| std | 28.429747 | 45739.536688 | 27.934661 |
| min | 0.000000 | 0.000000 | 0.000000 |
| 25% | 25.000000 | 74572.000000 | 28.000000 |
| 50% | 48.000000 | 110045.000000 | 50.000000 |
| 75% | 73.000000 | 149092.750000 | 75.000000 |
| max | 99.000000 | 189974.000000 | 100.000000 |
# Keep only rows strictly above every threshold:
# Income > 15000, Age > 16 and Score > 3.
d = d.loc[d['Income'].gt(15000) & d['Age'].gt(16) & d['Score'].gt(3)]
# Re-check the summary statistics after filtering (notebook cell output).
d.describe()
| | Age | Income | Score |
|---|---|---|---|
| count | 1596.000000 | 1596.000000 | 1596.000000 |
| mean | 56.921053 | 112080.638471 | 51.760025 |
| std | 24.141070 | 43764.532653 | 26.997632 |
| min | 17.000000 | 17000.000000 | 4.000000 |
| 25% | 35.000000 | 76077.250000 | 30.000000 |
| 50% | 57.000000 | 110045.000000 | 51.000000 |
| 75% | 78.000000 | 149628.000000 | 75.000000 |
| max | 99.000000 | 189974.000000 | 100.000000 |
# (rows, columns) of the frame after the row filter; bare expression shown as cell output.
d.shape
(1596, 4)
# NOTE: the bare px.* / full_data expressions below only render when each one
# is the last statement of its own notebook cell.

# Gender split of the filtered customers. Count once and reuse the result for
# both the slice sizes and the slice names.
gender_counts = d['Gender'].value_counts()
px.pie(values=gender_counts, names=gender_counts.index)

# Distribution of each numeric feature, with a violin plot in the margin.
px.histogram(d, x='Income', marginal='violin')
px.histogram(d, x='Score', marginal='violin')
px.histogram(d, x='Age', marginal='violin')

# Only the numeric columns take part in clustering; Gender is dropped here.
d = d[['Age', 'Income', 'Score']]

# Standardize the features so that Income (tens of thousands) does not
# dominate the Euclidean distances KMeans relies on.
scaler = StandardScaler()
full_data = scaler.fit_transform(d)
full_data
array([[-1.48843078, -1.76181015, 1.08339647],
[-1.52986694, -0.59611771, -1.69549562],
[-1.40555844, -1.213249 , 0.93518889],
...,
[ 1.24635632, -0.48272555, -1.39908047],
[ 0.83199464, 1.60061827, -1.76959941],
[ 1.37066482, -0.03361396, 0.00889153]])
# Project the standardized features onto their first two principal
# components; `data` is the 2-D view used for the scatter plot.
pca = PCA(n_components=2, random_state=42)
data = pca.fit_transform(full_data)

# Elbow method: fit KMeans for k = 1..10 on the full standardized feature set
# and record the inertia (within-cluster sum of squared distances) for each k.
# n_init is passed explicitly: it keeps the 10-restart behaviour and silences
# the FutureWarning about the default changing to 'auto' in scikit-learn 1.4.
k_values = range(1, 11)
ssd = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(full_data)
    ssd.append(kmeans.inertia_)

# Keep the elbow table in its own variable so the customer frame `d` is not
# overwritten and earlier cells can still be re-run.
elbow_df = pd.DataFrame({'k': k_values, 'ssd': ssd})

# Line plot of inertia against k; the "elbow" suggests a reasonable k.
fig = px.line(elbow_df, x='k', y='ssd', title='Elbow Method')
fig.update_traces(mode='markers+lines', marker=dict(size=8))
fig.show()
C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. 
C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7. C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7.
# Final model: two clusters, fitted on the full standardized feature set
# (not on the 2-D PCA projection).
# n_init is explicit to keep the 10-restart behaviour and avoid the
# FutureWarning about the default changing in scikit-learn 1.4.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(full_data)
# Cluster index assigned to each row of full_data.
labels = kmeans.labels_
# One row per cluster, expressed in full_data's feature space.
centroids = kmeans.cluster_centers_
C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\Users\utkar\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=7.
import plotly.graph_objs as go

# The points are drawn in PCA space, while the KMeans centroids live in the
# standardized feature space the model was fitted on. Project them with the
# same fitted PCA so they share the plot's axes; using their first two raw
# coordinates would place them at unrelated positions.
centroids_2d = pca.transform(centroids)

# Scatter of the 2-D PCA projection, coloured by cluster label.
fig = px.scatter(
    x=data[:, 0],
    y=data[:, 1],
    color=labels,
    size_max=5,
    opacity=0.7,
    labels={'x': 'X', 'y': 'Y'},
    color_continuous_scale=['black', 'cyan'],
)

# Overlay the cluster centres; labels are generated from the centroid count
# so the plot stays correct if the number of clusters changes.
fig.add_trace(
    go.Scatter(
        x=centroids_2d[:, 0],
        y=centroids_2d[:, 1],
        mode='markers+text',
        text=[f'Centroid {i}' for i in range(1, len(centroids_2d) + 1)],
        marker=dict(
            size=20,
            color='orange',
            opacity=1.0,
            symbol='diamond',
        ),
    )
)

# Hide the colour bar (labels are discrete cluster ids) and set the title.
fig.update_layout(
    coloraxis_showscale=False,
    title='K Means Clustering Visualization',
)

fig.show()